import os
from git import Repo
import dimcat as dc
from ms3 import __version__ as ms3_version
# Location of the data repository. The leading "~" is expanded explicitly because
# os.path.join() and os.makedirs() treat it as a literal directory name, so any
# path derived from an unexpanded value would end up below the working directory.
corpus_path = os.path.expanduser("~/romantic_piano_corpus")
repo = Repo(corpus_path)
# Find the repository containing this notebook by searching the parent directories
# of the working directory, then ask git for that repository's top-level path.
notebook_repo = Repo('.', search_parent_directories=True)
notebook_repo_path = notebook_repo.git.rev_parse("--show-toplevel")
# Print the abbreviated commit hashes and the library versions in use.
print(f"Notebook repository '{os.path.basename(notebook_repo_path)}' @ {notebook_repo.commit().hexsha[:7]}")
print(f"Data repo '{os.path.basename(corpus_path)}' @ {repo.commit().hexsha[:7]}")
print(f"dimcat version {dc.__version__}")
print(f"ms3 version {ms3_version}")
Notebook repository 'dimcat' @ 10cf791
Data repo 'romantic_piano_corpus' @ c3ac88c
dimcat version 0.2.0.post1.dev109+g2332fcf.d20230210
ms3 version 1.2.3
from fractions import Fraction
import ms3
from IPython.display import HTML
from git import Repo
import plotly.express as px
import colorlover
import pandas as pd
# Show up to 100 columns when displaying DataFrames.
pd.options.display.max_columns = 100

# Layout settings that are passed to update_layout() for the figures below.
STD_LAYOUT = dict(
    paper_bgcolor='#FFFFFF',
    plot_bgcolor='#FFFFFF',
    margin=dict(l=40, r=0, b=0, t=40, pad=0),
    font=dict(size=15),
)

# Figures are written to a 'figures' folder inside the corpus directory.
# NOTE(review): os.path.join() does not expand "~"; corpus_path needs to be a
# user-expanded path for this to point into the home directory.
# Alternative output location:
#OUTPUT_DIR = "/home/hentsche/Documents/phd/romantic_piano_corpus_report/figures/"
OUTPUT_DIR = os.path.join(corpus_path, 'figures')
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Preview of colour scales (uncomment the first line to render all colorlover scales).
#HTML(colorlover.to_html(colorlover.scales))
HTML(colorlover.to_html(colorlover.scales['9']['qual']['Paired']))
fig = px.colors.qualitative.swatches()
fig.show()

# Qualitative colour sequence from which the corpus colours are drawn.
corpus_color_scale = px.colors.qualitative.D3

Overview

# Create the dimcat Dataset for the corpus directory and display its data overview.
dataset = dc.Dataset(directory=corpus_path)
dataset.data
[default|all]
All corpora
-----------
View: This view is called 'default'. It
	- excludes fnames that are not contained in the metadata,
	- filters out file extensions requiring conversion (such as .xml), and
	- excludes review files and folders.

                               has   active   scores measures           notes        expanded
                          metadata     view detected detected parsed detected parsed detected parsed
corpus
beethoven_piano_sonatas        yes  default       87       87     87       87     87       64     64
chopin_mazurkas                yes  default       55       55     55       55     55       55     55
debussy_suite_bergamasque      yes  default        4        4      4        4      4        4      4
dvorak_silhouettes             yes  default       12       12     12       12     12       12     12
grieg_lyric_pieces             yes  default       66       66     66       66     66       66     66
liszt_pelerinage               yes  default       19       19     19       19     19       19     19
medtner_tales                  yes  default       19       19     19       19     19       19     19
schumann_kinderszenen          yes  default       13       13     13       13     13       13     13
tchaikovsky_seasons            yes  default       12       12     12       12     12       12     12

824/2236 files are excluded from this view.

792 files have been excluded based on their subdir.
32 files have been excluded based on their file name.


There are 2 orphans that could not be attributed to any of the respective corpus's fnames.
# Fetch the metadata table and compare its number of rows with the number of pieces.
all_metadata = dataset.data.metadata()
print(f"Concatenated 'metadata.tsv' files cover {len(all_metadata)} of the {dataset.data.n_pieces} scores.")
# Show the first metadata row of every group of the first index level.
all_metadata.groupby(level=0).nth(0)
Concatenated 'metadata.tsv' files cover 287 of the 287 scores.
TimeSig KeySig last_mc last_mn length_qb last_mc_unfolded last_mn_unfolded length_qb_unfolded volta_mcs all_notes_qb n_onsets n_onset_positions guitar_chord_count form_label_count label_count annotated_key harmony_version annotators reviewers composed_start composed_end composed_source composer workTitle movementNumber movementTitle workNumber poet lyricist arranger copyright creationDate mscVersion platform source translator title_text subtitle_text lyricist_text composer_text musescore ms3_version subdirectory rel_path has_drumset ambitus imslp musicbrainz viaf wikidata originalFormat staff_1_ambitus staff_1_instrument staff_2_ambitus staff_2_instrument score_integrity imslp.1 key mode typesetter text pdf score integrity comments staff_3_ambitus staff_3_instrument PDF staff_4_ambitus staff_4_instrument
corpus
beethoven_piano_sonatas 1: 2/2 1: -4 154 152 608.0 308.0 304.0 1216.0 NaN 1476.00 1679 985 0 0 241 f 2.3.0 Lars & Ya-Chuan (2.2.0), John Heilig (2.3.0) AN 1793 1795 OxfordMusicOnline Ludwig van Beethoven Sonata no. 1 1 Allegro op.2/1 NaN NaN NaN NaN 2019-03-05 3.02 Apple Macintosh NaN NaN Sonata no. 1 1. Allegro NaN Ludwig van Beethoven 3.6.2 1.1.1 MS3 MS3/01-1.mscx False 32-89 (Ab1-F6) https://imslp.org/wiki/Klaviersonaten_(Beethov... https://musicbrainz.org/work/a78520e0-0211-3b5... https://viaf.org/viaf/179625665 https://www.wikidata.org/wiki/Q145813 xml 51-89 (Eb3-F6) piano 32-73 (Ab1-Db5) piano NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
chopin_mazurkas 1: 3/4 1: 2 65 64 193.0 65.0 64.0 193.0 NaN 711.00 810 274 0 0 116 b 2.3.0 Wendelin Bitzan (1.0.0), Adrian Nagel (2.2.0),... JH, AN, DK 1837 1837 OxfordMusicOnline Frédéric Chopin Mazurkas 2 NaN Op. 30 NaN NaN NaN NaN 2019-02-08 3.02 Apple Macintosh https://github.com/craigsapp/chopin-mazurkas NaN Mazurkas, Op. 30 Mazurka in b, Op. 30, no. 2 NaN Frédéric Chopin 3.6.2 1.1.1 MS3 MS3/BI105-2op30-2.mscx False 35-90 (B1-F#6) https://imslp.org/wiki/Mazurkas%2C_Op.30_(Chop... https://musicbrainz.org/work/13e317ea-5e50-3d5... NaN https://www.wikidata.org/wiki/Q6799054 xml 59-90 (B3-F#6) piano 35-71 (B1-B4) piano Cédric Koller NaN B minor / F sharp minor NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
debussy_suite_bergamasque 1: 4/4 1: -1 89 89 356.0 89.0 89.0 356.0 NaN 1533.67 1721 870 0 0 274 F 2.3.0 Adrian Nagel (2.1.1), Amelia Brey (2.3.0) AB, AN 1890 1905 Oxford Music Online Claude Debussy Suite Bergamasque 1 Prelude L.75 NaN NaN NaN NaN 2015-05-19 3.02 Microsoft Windows http://musescore.com/score/890041 NaN Suite Bergamasque 1. Prelude NaN Claude Debussy 3.6.2 1.1.1 MS3 MS3/l075-01_suite_prelude.mscx False 24-94 (C1-Bb6) https://imslp.org/wiki/Suite_bergamasque_(Debu... https://musicbrainz.org/work/fe4cfa64-156a-3d7... https://viaf.org/viaf/177398380 https://www.wikidata.org/wiki/Q29117932 NaN 48-94 (C3-Bb6) Piano 24-90 (C1-F#6) Piano NaN NaN NaN NaN NaN <b>Prélude</b> NaN NaN NaN NaN NaN NaN NaN NaN
dvorak_silhouettes 1: 6/8 1: 4, 7: -5, 49: 4 54 52 156.5 54.0 52.0 156.5 NaN 658.75 957 288 0 0 80 c# 2.3.0 Daniel Grote (2.1.1), Hanné Becker (2.3.0) Johannes Hentschel (2.1.1), AN 1875 1879 OxfordMusicOnline Antonín Dvořák Silhouettes 1 Allegro feroce op. 8 NaN NaN NaN NaN 2018-05-26 3.02 Microsoft Windows NaN NaN Silhouettes, op. 8 1. Allegro feroce NaN Antonín Dvořák 3.6.2 1.1.1 MS3 MS3/op08n01.mscx False 32-92 (G#1-Ab6) https://imslp.org/wiki/Silhouettes%2C_Op.8_(Dv... https://musicbrainz.org/work/80bb714e-a36a-425... https://viaf.org/viaf/174794325/ NaN xml 56-92 (G#3-Ab6) Piano 32-68 (G#1-G#4) Piano NaN NaN NaN NaN NaN NaN https://imslp.org/wiki/Special:ReverseLookup/5... Tom Schreyer NaN NaN NaN NaN NaN NaN
grieg_lyric_pieces 1: 2/4 1: -3 23 23 46.0 23.0 23.0 46.0 NaN 135.50 268 156 0 0 43 Eb 2.3.0 Adrian Nagel (2.1.1), John Heilig (2.30) Adrian Nagel 1864 1867 OxfordMusicOnline Edvard Grieg Lyric Pieces 1 Arietta Op. 12 NaN NaN NaN NaN 2018-10-11 3.02 Microsoft Windows NaN NaN Lyric Pieces, Op. 12 1. Arietta NaN Edvard Grieg 3.6.2 1.1.1 MS3 MS3/op12n01.mscx False 39-79 (Eb2-G5) https://imslp.org/wiki/Lyric_Pieces,_Op.12_(Gr... https://musicbrainz.org/work/b6115546-141a-336... NaN https://www.wikidata.org/wiki/Q2304758 mxl 55-79 (G3-G5) NaN 39-71 (Eb2-Cb5) NaN Tom Schreyer NaN NaN NaN NaN NaN https://imslp.eu/files/imglnks/euimg/8/8e/IMSL... NaN NaN NaN NaN NaN NaN NaN
liszt_pelerinage 1: 4/4 1: 0 97 97 388.0 97.0 97.0 388.0 NaN 1902.42 2879 1069 0 0 174 C 2.3.0 Adrian Nagel (2.1.1), Amelia Brey (2.3.0) Johannes Hentschel (1-33 & 82-97), AB, AN 1848 1855 OxfordMusicOnline Franz Liszt Années de Pèlerinage, Première année: Suisse 1 Chapelle de Guillaume Tell S.160 NaN NaN NaN NaN 2019-01-26 3.02 Microsoft Windows https://musescore.com/score/3987861 NaN Années de Pèlerinage, Première année: Suisse, ... <font size="18"/>1. CHAPELLE DE GUILLAUME TELL... NaN Franz Liszt 3.6.2 1.1.1 MS3 MS3/160.01_Chapelle_de_Guillaume_Tell.mscx False 24-96 (C1-C7) https://imslp.org/wiki/Ann%C3%A9es_de_p%C3%A8l... https://musicbrainz.org/work/5804701d-54a6-4c9... https://viaf.org/viaf/179020308/ https://www.wikidata.org/wiki/Q567462 xml 40-96 (E2-C7) Piano 24-79 (C1-G5) Piano Tom Schreyer NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN https://imslp.org/wiki/Special:ReverseLookup/1... NaN NaN
medtner_tales 1: 4/8 1: -3 81 81 162.0 81.0 81.0 162.0 NaN 603.00 1481 528 0 0 213 c 2.2.0 Wendelin Bitzan Adrian Nagel 1904 1905 OxfordMusicOnline Nikolai Medtner Tales 1 Andantino op.8 NaN NaN NaN NaN 2017-10-21 3.02 Apple Macintosh NaN NaN Tales, op.8 1. Andantino NaN Nikolai Medtner 3.6.2 1.1.1 MS3 MS3/op08n01.mscx False 22-87 (Bb0-Eb6) https://imslp.org/wiki/2_Tales%2C_Op.8_(Medtne... https://musicbrainz.org/work/0cdc7080-06b8-33d... https://viaf.org/viaf/2467165271623010690003 NaN NaN 47-87 (B2-Eb6) Piano 22-77 (Bb0-F5) Piano Tom Schreyer NaN NaN NaN NaN NaN https://imslp.org/wiki/Special:ReverseLookup/5790 NaN NaN NaN NaN NaN NaN NaN
schumann_kinderszenen 1: 2/4 1: 1 22 22 44.0 44.0 44.0 88.0 NaN 134.33 241 141 0 0 44 G 2.3.0 Tal Soker (2.1.1), John Heilig (2.3.0) AN, JHei, JH 1838 1839 OxfordMusicOnline Robert Schumann Kinderszenen 1 Von fremden Ländern und Menschen Op.15 NaN NaN NaN NaN 2017-03-11 3.02 Microsoft Windows http://musescore.com/user/22249306/scores/4778176 NaN Von fremden Ländern und Menschen\n(Of Foreign ... NaN NaN Robert Schumann 3.6.2 1.1.1 MS3 MS3/n01.mscx False 42-79 (F#2-G5) https://imslp.org/wiki/Kinderszenen,_Op.15_(Sc... https://musicbrainz.org/work/04bf8808-7a43-30e... https://viaf.org/viaf/174865068/ https://www.wikidata.org/wiki/Q1569982 NaN 62-79 (D4-G5) Piano 42-69 (F#2-A4) Piano Tom Schreyer NaN NaN NaN NaN NaN https://imslp.org/wiki/Special:ReverseLookup/6... NaN NaN NaN NaN NaN NaN NaN
tchaikovsky_seasons 1: 3/4 1: 3, 29: 1, 63: 3 103 103 309.0 103.0 103.0 309.0 NaN 1058.17 1537 829 0 0 313 A 2.3.0 Adrian Nagel (2.1.1), John Heilig (2.3.0) Johannes Hentschel, AN 1875 1876 OxfordMusicOnline Pyotr Ilyich Tchaikovsky The Seasons 1 January: At the Fireside Op. 37a NaN NaN NaN NaN 11/29/18 3.02 Linux http://musescore.com/user/12839876/scores/3444321 NaN 1. January: At the Fireside from: <i>The Seasons</i>, op. 37a «И мирной неги уголок\nНочь сумраком одела,\nВ... Pyotr Ilyich Tchaikovsky 3.6.2 1.1.1 MS3 MS3/op37a01.mscx False 33-88 (A1-E6) https://imslp.org/wiki/The_Seasons,_Op.37a_(Tc... https://musicbrainz.org/work/6460a645-9844-304... https://viaf.org/viaf/183857288 https://www.wikidata.org/wiki/Q2914902 mxl 53-88 (E#3-E6) Piano 33-88 (A1-E6) Piano NaN NaN NaN NaN NaN NaN https://imslp.org/wiki/Special:ReverseLookup/1... Tom Schreyer NaN NaN NaN NaN NaN NaN
# Run dimcat's IsAnnotatedFilter on the dataset and report the number of IDs before and after.
annotated = dc.IsAnnotatedFilter().process_data(dataset)
print(f"Before: {dataset.n_indices} IDs, after filtering: {annotated.n_indices}")
Before: 287 IDs, after filtering: 264

Choose here if you want to see stats for all or only for annotated scores.

# Toggle: swap the commented line to compute the statistics below on the
# unfiltered dataset instead of the annotated subset.
#selected = dataset
selected = annotated

Compute chronological order

# Keep only the metadata rows that have at least one label.
summary = all_metadata.loc[all_metadata.label_count > 0]
print(f"Selected metadata rows cover {len(summary)} of the {selected.n_indices} scores.")

# Mean 'composed_end' year per group of the first index level, truncated to int
# and sorted ascending; the resulting index order is the chronological order.
mean_composition_years = (
    summary.groupby(level=0)
    .composed_end.mean()
    .astype(int)
    .sort_values()
)
chronological_order = list(mean_composition_years.index)

# Assign one colour of the colour scale to each corpus, in chronological order.
dataset_colors = {
    corpus: color
    for corpus, color in zip(chronological_order, corpus_color_scale)
}
chronological_order
Selected metadata rows cover 264 of the 264 scores.
['beethoven_piano_sonatas',
 'chopin_mazurkas',
 'schumann_kinderszenen',
 'liszt_pelerinage',
 'tchaikovsky_seasons',
 'dvorak_silhouettes',
 'grieg_lyric_pieces',
 'debussy_suite_bergamasque',
 'medtner_tales']

Notes

# Retrieve the 'notes' facet of the selected data.
all_notes = selected.get_facet('notes')
# Grouping by the first two index levels is used to count the files.
print(f"{len(all_notes.index)} notes over {len(all_notes.groupby(level=[0,1]))} files.")
all_notes.head()
434219 notes over 264 files.
mc mn quarterbeats duration_qb mc_onset mn_onset timesig staff voice duration gracenote nominal_duration scalar tied tpc midi name octave chord_id volta tremolo
corpus fname interval
beethoven_piano_sonatas 01-1 [0.0, 1.0) 1 0 0 1.0 0 3/4 2/2 1 1 1/4 NaN 1/4 1 <NA> 0 60 C4 4 0 <NA> NaN
[1.0, 2.0) 2 1 1 1.0 0 0 2/2 1 1 1/4 NaN 1/4 1 <NA> -1 65 F4 4 1 <NA> NaN
[2.0, 3.0) 2 1 2 1.0 1/4 1/4 2/2 1 1 1/4 NaN 1/4 1 <NA> -4 68 Ab4 4 2 <NA> NaN
[3.0, 4.0) 2 1 3 1.0 1/2 1/2 2/2 1 1 1/4 NaN 1/4 1 <NA> 0 72 C5 5 3 <NA> NaN
[4.0, 5.0) 2 1 4 1.0 3/4 3/4 2/2 1 1 1/4 NaN 1/4 1 <NA> -1 77 F5 5 4 <NA> NaN
def weight_notes(nl, group_col='midi', precise=True):
    """Turn a note list into a flat Series of ``group_col`` values weighted by duration.

    The ``duration_qb`` values are summed per group and normalized such that the
    smallest positive sum corresponds to one occurrence; every group value is then
    repeated according to its rounded weight.

    Parameters
    ----------
    nl : pandas.DataFrame
        Note list containing a ``duration_qb`` column and the column(s) ``group_col``.
    group_col : str or list of str
        Column(s) to group by. Defaults to 'midi'.
    precise : bool
        If False, the normalized weights are additionally divided by 1.9999999,
        which shrinks the result (less compute time) at the cost of precision.

    Returns
    -------
    pandas.Series
        The group values, each repeated according to its weight.
    """
    summed_durations = nl.groupby(group_col).duration_qb.sum()
    # Groups whose durations sum to 0 would make the minimum 0; dividing by it
    # yields inf/NaN, which cannot be converted to integer counts. Such groups
    # would receive zero occurrences anyway, so they are dropped beforehand.
    summed_durations = summed_durations[summed_durations > 0]
    # Normalize such that the shortest duration results in 1 occurrence.
    weights = summed_durations / summed_durations.min()
    if not precise:
        # This simple trick reduces compute time but also precision:
        # The rationale is to have the smallest value be slightly larger than 0.5 because
        # if it was exactly 0.5 it would be rounded down by repeat_notes_according_to_weights()
        weights = weights / 1.9999999
    return repeat_notes_according_to_weights(weights)


def repeat_notes_according_to_weights(weights):
    """Repeat each index value of ``weights`` as many times as its rounded weight.

    Parameters
    ----------
    weights : pandas.Series
        Numeric weights; the index holds the values to be repeated.

    Returns
    -------
    pandas.Series
        Index values of ``weights`` in their original order, each occurring
        ``round(weight)`` times (weights rounding to 0 or below contribute nothing).
    """
    # Negative counts are clipped to 0 because the array repeat rejects them,
    # whereas multiplying a list by a negative number simply yields nothing.
    counts = weights.round().astype(int).clip(lower=0)
    # Repeat on plain arrays: no Python-level loop and the result stays unnamed.
    repeated = counts.index.to_numpy().repeat(counts.to_numpy())
    return pd.Series(repeated)
# Display names for the corpora, keyed by corpus (directory) name.
dataset_names = {
    'beethoven_piano_sonatas': 'Beethoven Sonatas',
    'chopin_mazurkas': 'Chopin Mazurkas',
    'debussy_suite_bergamasque': 'Debussy Suite',
    'dvorak_silhouettes': "Dvořák Silhouettes",
    'grieg_lyric_pieces': "Grieg Lyric Pieces",
    'liszt_pelerinage': "Liszt Années",
    'medtner_tales': "Medtner Tales",
    'schumann_kinderszenen': "Schumann Kinderszenen",
    'tchaikovsky_seasons': "Tchaikovsky Seasons",
}

# The same colours and chronological order, expressed in terms of display names.
dataset_name_colors = {
    dataset_names[corpus]: color for corpus, color in dataset_colors.items()
}
chronological_dataset_names = [dataset_names[corpus] for corpus in chronological_order]

# Add the display name of each note's corpus (first index level) as a column.
all_notes['dataset_name'] = all_notes.index.get_level_values(0).map(dataset_names)
grouped_notes = all_notes.groupby('dataset_name')

# One duration-weighted MIDI Series per dataset, concatenated with the dataset
# names as outer index level, which is then turned into a regular column.
weighted_midi = pd.concat(
    [weight_notes(notes, 'midi', precise=False) for _, notes in grouped_notes],
    keys=grouped_notes.groups.keys(),
)
weighted_midi = weighted_midi.reset_index(level=0)
weighted_midi.columns = ['dataset', 'midi']
weighted_midi
dataset midi
0 Beethoven Sonatas 24
1 Beethoven Sonatas 24
2 Beethoven Sonatas 24
3 Beethoven Sonatas 24
4 Beethoven Sonatas 24
... ... ...
13639 Tchaikovsky Seasons 91
13640 Tchaikovsky Seasons 91
13641 Tchaikovsky Seasons 92
13642 Tchaikovsky Seasons 92
13643 Tchaikovsky Seasons 93

527810 rows × 2 columns

# Y-axis ticks at MIDI pitches 12, 24, ..., 96, labelled "C0" through "C7".
yaxis = {
    'tickmode': 'array',
    'tickvals': [12, 24, 36, 48, 60, 72, 84, 96],
    'ticktext': ["C0", "C1", "C2", "C3", "C4", "C5", "C6", "C7"],
    'gridcolor': 'lightgrey',
}

# One violin (with inner box plot) per dataset, ordered chronologically and
# coloured with the per-dataset colours.
fig = px.violin(
    weighted_midi,
    x='dataset',
    y='midi',
    color='dataset',
    box=True,
    labels={
        'dataset': '',
        'midi': 'distribution of pitches by duration',
    },
    category_orders={'dataset': chronological_dataset_names},
    color_discrete_map=dataset_name_colors,
    width=1000,
    height=600,
)
fig.update_traces(spanmode='hard')  # do not extend beyond outliers
fig.update_layout(**STD_LAYOUT, yaxis=yaxis, showlegend=False)

# Save the figure as PNG in the output directory before displaying it.
violin_path = os.path.join(OUTPUT_DIR, "ambitus_per_dataset_colored.png")
fig.write_image(violin_path, scale=2)
fig.show()
# Total duration (in quarter notes) per 'tpc' value over all notes.
bar_data = all_notes.groupby('tpc')['duration_qb'].sum().reset_index()
# Store the profile as a tab-separated file in the working directory.
bar_data.to_csv('romantic_tpc_profile.tsv.zip', sep='\t')

# Tick positions for every integer between the smallest and largest tpc and
# their names as returned by ms3.fifths2name().
x_values = list(range(bar_data['tpc'].min(), bar_data['tpc'].max() + 1))
x_names = ms3.fifths2name(x_values)

fig = px.bar(
    bar_data,
    x='tpc',
    y='duration_qb',
    labels={
        'tpc': 'Named pitch class',
        'duration_qb': 'Duration in quarter notes',
    },
    color_discrete_sequence=corpus_color_scale,
    width=1000,
    height=300,
)
fig.update_layout(**STD_LAYOUT)
fig.update_yaxes(gridcolor='lightgrey')
fig.update_xaxes(
    gridcolor='lightgrey',
    zerolinecolor='grey',
    tickmode='array',
    tickvals=x_values,
    ticktext=x_names,
    dtick=1,
    ticks='outside',
    tickcolor='black',
    # Additional darker grid line every 6 steps.
    minor={'dtick': 6, 'gridcolor': 'grey', 'showgrid': True},
)

# Save the figure as PNG in the output directory before displaying it.
fig.write_image(os.path.join(OUTPUT_DIR, "tpc_distribution_overall.png"), scale=2)
fig.show()
# Total duration per dataset and 'tpc' value.
scatter_data = all_notes.groupby(['dataset_name', 'tpc']).duration_qb.sum().reset_index()
# One facet per dataset. The key of category_orders has to be the name of the
# column being ordered ('dataset_name'); a key that matches no column is
# ignored and the facets would not follow the chronological order.
fig = px.scatter(scatter_data, x='tpc', y='duration_qb', color='dataset_name',
                 labels=dict(
                     duration_qb='duration',
                     tpc='named pitch class',
                 ),
                 category_orders=dict(dataset_name=chronological_dataset_names),
                 color_discrete_map=dataset_name_colors,
                 facet_col='dataset_name', facet_col_wrap=3, facet_col_spacing=0.03,
                 width=1000, height=500,
                )
fig.update_traces(mode='lines+markers')
# Keep only the part after "=" in the facet titles.
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_layout(**STD_LAYOUT, showlegend=False)
fig.update_xaxes(gridcolor='lightgrey', zerolinecolor='lightgrey', tickmode='array', tickvals= [-12, -6, 0, 6, 12, 18],
    ticktext = ["Dbb", "Gb", "C", "F#", "B#", "E##"], visible=True, )
# matches=None gives every facet its own y-axis range.
fig.update_yaxes(gridcolor='lightgrey', zeroline=False, matches=None, showticklabels=True)
fig.write_image(os.path.join(OUTPUT_DIR, "tpc_line_per_dataset_compact.png"), scale=2)
fig.show()
# Stacked bars of duration per 'tpc' value, coloured by dataset. The key of
# category_orders has to be the name of the column being ordered
# ('dataset_name'); a key that matches no column is ignored and the datasets
# would not be stacked in chronological order.
px.bar(scatter_data, x='tpc', y='duration_qb', color='dataset_name',
                 labels=dict(
                     duration_qb='duration',
                     tpc='named pitch class',
                 ),
                 category_orders=dict(dataset_name=chronological_dataset_names),
                 color_discrete_map=dataset_name_colors,
                 width=1000, height=500,
                )
# Split the summed durations into tpc values within [-1, 5] (inclusive) and all others.
no_accidental = bar_data.loc[bar_data.tpc.between(-1, 5), 'duration_qb'].sum()
with_accidental = bar_data.loc[~bar_data.tpc.between(-1, 5), 'duration_qb'].sum()
entire = no_accidental + with_accidental
f"Fraction of note duration without accidental of the entire durations: {no_accidental} / {entire} = {no_accidental / entire}"
'Fraction of note duration without accidental of the entire durations: 182796.29129689754 / 291874.3118386243 = 0.6262842733414806'

Notes and staves

# Count the notes per value of the 'staff' column over all selected notes.
print("Distribution of notes over staves:")
all_notes.staff.value_counts()
Distribution of notes over staves:
1    230221
2    200617
3      2397
4       984
Name: staff, dtype: Int64
print("Distribution of notes over staves for all pieces with more than two staves\n")
# Walk through the pieces (groups of the first two index levels) and skip those
# that have no note on a staff above the second.
for group, df in all_notes.groupby(level=[0,1]):
    uses_extra_staff = (df.staff > 2).any()
    if not uses_extra_staff:
        continue
    staff_counts = df.staff.value_counts().to_dict()
    print(group)
    print(staff_counts)
Distribution of notes over staves for all pieces with more than two staves

('grieg_lyric_pieces', 'op43n06')
{2: 769, 3: 422, 1: 180}
('liszt_pelerinage', '161.04_Sonetto_47_del_Petrarca')
{1: 1076, 2: 628, 3: 42, 4: 29}
('liszt_pelerinage', '161.07_Apres_une_lecture_du_Dante')
{1: 6638, 2: 5181, 3: 50}
('liszt_pelerinage', '162.01_Gondoliera')
{3: 1745, 4: 955}
('medtner_tales', 'op34n03')
{1: 1219, 2: 816, 3: 89}
('medtner_tales', 'op35n04')
{1: 1678, 2: 1632, 3: 49}
# Restrict to notes on staves above the second and count them per piece
# (first two index levels) and staff.
all_notes[all_notes.staff > 2].groupby(level=[0,1]).staff.value_counts()
corpus              fname                              staff
grieg_lyric_pieces  op43n06                            3         422
liszt_pelerinage    161.04_Sonetto_47_del_Petrarca     3          42
                                                       4          29
                    161.07_Apres_une_lecture_du_Dante  3          50
                    162.01_Gondoliera                  3        1745
                                                       4         955
medtner_tales       op34n03                            3          89
                    op35n04                            3          49
Name: staff, dtype: int64